Creating data visualisation beyond default
In this take-home exercise, I will…
Before I get started, it is important for us to ensure that the required R packages have been installed. If yes, we will load the R packages. If they have yet to be installed, I will install the R packages and load them onto R environment.
The chunk code below will do the trick.
packages = c('tidyverse','psych','plotly', 'ggiraph')
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}
The code chunk below imports Participants.csv from the data
folder, into R by using read_csv()
of readr and
save it as an tibble dataframe called participants_data.
# emp_data <- read_csv("data/Employers.csv")
job_data <- read_csv("data/Jobs.csv")
fin_data <- read_csv("data/FinancialJournal.csv")
participants_data <- read_csv("data/Participants.csv")
pub_data <- read_csv("data/Pubs.csv")
restaurant_data <- read_csv("data/Restaurants.csv")
checkin_data <- read_csv("data/CheckinJournal.csv")
glimpse(checkin_data)
Rows: 2,100,635
Columns: 4
$ participantId <dbl> 619, 15, 23, 699, 876, 23, 154, 359, 509, 556,~
$ timestamp <dttm> 2022-03-01 05:35:00, 2022-03-01 05:50:00, 202~
$ venueId <dbl> 1798, 1798, 894, 1798, 1804, 895, 446, 893, 18~
$ venueType <chr> "Pub", "Pub", "Pub", "Pub", "Restaurant", "Res~
table(fin_data$category)
Education Food Recreation RentAdjustment
4060 974100 355359 131
Shelter Wage
14103 508577
des <- describe(fin_data, fast = TRUE)
print(des, digits=5)
vars n mean sd min max
participantId 1 1856330 480.81309 295.8256 0.000 1010.000
timestamp 2 1856330 NaN NA Inf -Inf
category 3 1856330 NaN NA Inf -Inf
amount 4 1856330 19.92172 108.0637 -1562.726 4096.526
range se
participantId 1010.000 0.21712
timestamp -Inf NA
category -Inf NA
amount 5659.252 0.07931
ggplot(data=checkin_data, aes(x='participantId')) +
geom_bar() +
facet_wrap(~venueType)
merged_dataframe <- merge(x = participants_data, y = checkin_data, all.x = TRUE)
ggplotly(ggplot(data = merged_dataframe,
aes(x=age, fill= venueType)) +
geom_histogram(bins=20, color='black') +
ggtitle("Histogram of Participants' age, filled by Education Level") +
xlab('Age') +
ylab('Count') +
labs(fill="Education Level") +
scale_x_continuous(breaks = seq(10, 70, by = 2)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)))
ggplot(data = merged_dataframe,
aes(y=age, x = haveKids)) +
geom_boxplot() +
ggtitle("Box Plot of Participants' age across various Interest Group, grouped by Education") +
xlab('Have Kids?') +
ylab('Age') +
facet_grid(educationLevel~venueType) +
coord_flip()
merged_dataframe %>%
mutate(`Education Level` = fct_relevel(educationLevel,"Low","HighSchoolOrCollege","Graduate","Bachelors")) %>%
ggplot(aes(x = `Education Level`,
fill = venueType)) +
geom_bar(position = 'fill') +
geom_text(stat = 'count',
aes(label = stat(count)),
position = position_fill(vjust=0.9)) +
scale_y_continuous(breaks = seq(0,1, by = 0.1),
labels = scales::percent) +
scale_x_discrete(labels = c("Low", "High School Or College", "Bachelors", "Graduate")) +
labs(y = 'Percentage\nof\nParticipants',
title = "Percentage Distribution of Participants' Education Level",
subtitle ='With respect to where Participants are travelling to',
fill ='Venue') +
theme(axis.title.y = element_text(angle = 0),
axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = 'grey'))
checkin_data %>%
mutate(venueType) %>%
ggplot(aes(x=))
ggplot(data=checkin_data, aes(x='participantId')) +
geom_bar() +
facet_wrap(~venueType)
participants_data %>% mutate(Education Level =
fct_relevel(educationLevel,“Low”,“HighSchoolOrCollege”,“Graduate”,“Bachelors”))
%>% ggplot(aes(x = Education Level, fill = haveKids)) +
geom_bar(position = ‘fill’) + geom_text(stat = ‘count’, aes(label =
stat(count)), position = position_fill(vjust=0.9)) +
scale_y_continuous(breaks = seq(0,1, by = 0.1), labels =
scales::percent) + scale_x_discrete(labels = c(“Low”, “High School Or
College”, “Bachelors”, “Graduate”)) + labs(y = ‘Percentage’, title =
“Percentage Distribution of Participants’ Education Level”, subtitle
=‘With respect to having kids or not’, fill =‘Have Kids?’) +
theme(axis.title.y = element_text(angle = 0), axis.ticks.x =
element_blank(), panel.background = element_blank(), axis.line =
element_line(color = ‘grey’))
glimpse(merged_rest)
Rows: 447,602
Columns: 9
$ venueId <dbl> 445, 445, 445, 445, 445, 445, 445, 445, 445, 4~
$ participantId <dbl> 993, 164, 868, 247, 52, 223, 764, 546, 137, 16~
$ timestamp <dttm> 2022-08-19 14:10:00, 2023-02-09 13:30:00, 202~
$ venueType <chr> "Restaurant", "Restaurant", "Restaurant", "Res~
$ foodCost <dbl> 5.15, 5.15, 5.15, 5.15, 5.15, 5.15, 5.15, 5.15~
$ maxOccupancy <dbl> 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71~
$ location <chr> "POINT (631.5130723031391 2001.4772026036535)"~
$ buildingId <dbl> 304, 304, 304, 304, 304, 304, 304, 304, 304, 3~
$ Month_Yr <chr> "2022-08", "2023-02", "2022-03", "2023-03", "2~
table(merged_rest$venueId)
445 446 447 448 449 895 896 897 898 899 1345
8358 17890 34295 33109 41693 30355 20784 31276 12801 28898 26330
1346 1347 1348 1349 1801 1802 1803 1804 1805
6326 9468 9625 5077 48264 16049 12124 15685 39195
merged_rest$venueId = as.factor(merged_rest$venueId)
p <- ggplot(data = merged_rest, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
geom_bar(stat="count",
position = position_stack(reverse = TRUE),
aes(text = paste0("Month_Yr: ", Month_Yr, "<br>",
"Max Occupancy: ", maxOccupancy,"<br>",
"Building: ", buildingId,"<br>",
"Food Cost: ", foodCost))) +
coord_flip() +
# facet_wrap(~Month_Yr) +
scale_y_continuous(breaks = seq(0,50000, by = 5000)) +
labs(x= 'Restaurant ID',
y = 'Count of Customers to Restaurants',
title = "XXX",
subtitle = 'YYY',
fill = 'Year-Month') +
theme(axis.title.y = element_text(angle = 0),
#axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color= 'grey'))
ggplotly(p, tooltip = c("x","text"))
p <- ggplot(data = merged_rest, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
geom_bar(stat="count",
position = position_stack(reverse = TRUE),
aes(text = paste0("Month_Yr: ", Month_Yr, "<br>",
"Max Occupancy: ", maxOccupancy,"<br>",
"Building: ", buildingId,"<br>",
"Food Cost: ", foodCost))) +
coord_flip() +
facet_wrap(~Month_Yr) +
scale_y_continuous(breaks = seq(0,50000, by = 5000)) +
labs(x= 'Restaurant ID',
y = 'Count of Customers to Restaurants',
title = "XXX",
subtitle = 'YYY',
fill = 'Year-Month') +
theme(axis.title.y = element_text(angle = 0),
#axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color= 'grey'),
panel.margin.x = unit(1,"lines"),
panel.margin.y = unit(2,"lines"))
ggplotly(p, tooltip = c("x","text"))
Rows: 355,378
Columns: 9
$ venueId <dbl> 442, 442, 442, 442, 442, 442, 442, 442, 442, 4~
$ participantId <dbl> 50, 948, 141, 952, 113, 583, 115, 830, 165, 83~
$ timestamp <dttm> 2022-03-25 13:45:00, 2022-06-26 08:35:00, 202~
$ venueType <chr> "Pub", "Pub", "Pub", "Pub", "Pub", "Pub", "Pub~
$ hourlyCost <dbl> 8.281103, 8.281103, 8.281103, 8.281103, 8.2811~
$ maxOccupancy <dbl> 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64~
$ location <chr> "POINT (964.4380231713202 3991.603473784208)",~
$ buildingId <dbl> 556, 556, 556, 556, 556, 556, 556, 556, 556, 5~
$ Month_Yr <chr> "2022-03", "2022-06", "2022-07", "2022-03", "2~
table(merged_pub$venueId)
442 443 444 892 893 894 1342 1343 1344 1798 1799
21703 19807 17592 23603 25255 20253 50400 31162 54174 31212 28589
1800
31628
merged_pub$venueId = as.factor(merged_pub$venueId)
aes(text = paste0(“Month_Yr:”, Month_Yr, “
”, “Hourly Cost:”,
hourlyCost))
, tooltip = “text”
p <- ggplot(data = merged_pub, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
geom_bar(stat="count",
position = position_stack(reverse = TRUE),
aes(text = paste0("Month_Yr: ", Month_Yr, "<br>",
"Max Occupancy: ", maxOccupancy,"<br>",
"Building: ", buildingId,"<br>",
"Hourly Cost: ", hourlyCost))) +
coord_flip() +
scale_y_continuous(breaks = seq(0,60000, by = 5000)) +
labs(x= 'Pub ID',
y = 'Count of Customers to Pubs',
title = "XXX",
subtitle = 'YYY',
fill = 'Year-Month') +
theme(axis.title.y = element_text(angle = 0),
#axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color= 'grey'))
ggplotly(p, tooltip = c("x","text"))
p <- ggplot(data = merged_pub, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
geom_bar(stat="count",
position = position_stack(reverse = TRUE),
aes(text = paste0("Month_Yr: ", Month_Yr, "<br>",
"Max Occupancy: ", maxOccupancy,"<br>",
"Building: ", buildingId,"<br>",
"Hourly Cost: ", hourlyCost))) +
coord_flip() +
facet_wrap(~Month_Yr) +
scale_y_continuous(breaks = seq(0,60000, by = 5000)) +
labs(x= 'Pub ID',
y = 'Count of Customers to Pubs',
title = "XXX",
subtitle = 'YYY',
fill = 'Year-Month') +
theme(axis.title.y = element_text(angle = 0),
#axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color= 'grey'))
ggplotly(p, tooltip = c("x","text"))
p <- ggplot(data = merged_pub, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
geom_bar(stat="count",
position = position_stack(reverse = TRUE),
aes(text = paste0("Month_Yr: ", Month_Yr, "<br>",
"Max Occupancy: ", maxOccupancy,"<br>",
"Building: ", buildingId,"<br>",
"Hourly Cost: ", hourlyCost))) +
coord_flip() +
scale_y_continuous(breaks = seq(0,60000, by = 5000)) +
labs(x= 'Pub ID',
y = 'Count of Customers to Pubs',
title = "XXX",
subtitle = 'YYY',
fill = 'Year-Month') +
theme(axis.title.y = element_text(angle = 0),
#axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color= 'grey'))
ggplotly(p, tooltip = c("x","text"))
ggplot(data = merged_rest, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
geom_bar(stat="count") +
coord_flip()